# -*- coding:utf-8 -*-
"""
Collect the simple sentence that follows the "but" marker in each lyric line.
"""
import simplejson
from gdsettings import GD_DIR
from LyricStructure.pylyric.Lyric import Lyric
from commontool.but_pattern import check_have_but_line,get_but_pos

def get_but_simple_sentence(lyrics_dir="/Users/wangxing/gd/lyrics",
                            out_path="but_line_list.txt",
                            split_index=0):
    """Dump the text following the "but" marker of every matching lyric line.

    The song ids are read from the ``train<split_index>.json`` and
    ``test<split_index>.json`` files under ``GD_DIR/data/vote_split``. For
    each id, ``<lyrics_dir>/<id>.lrc`` is read, decoded as GBK and parsed
    with ``Lyric``. Every line for which ``check_have_but_line`` returns
    exactly 1 contributes one entry ``"<id>:<text after the marker>"``.
    Duplicate entries are collapsed, and the result is written to
    ``out_path`` as UTF-8, one entry per line, in unspecified order.

    Args:
        lyrics_dir: Directory holding the ``<id>.lrc`` lyric files.
        out_path: Path of the text file to (over)write.
        split_index: Numeric suffix of the train/test split files to load.

    Returns:
        None. The result is the file written at ``out_path``.
    """
    # Load the ids of both splits; ``with`` closes each file promptly
    # instead of leaving the handle to the garbage collector.
    mid_list = []
    for split_name in ("train", "test"):
        split_path = "%s/data/vote_split/%s%d.json" % (
            GD_DIR, split_name, split_index)
        with open(split_path) as split_file:
            mid_list.extend(simplejson.load(split_file))

    # A set de-duplicates the "<id>:<text>" entries.
    simple_sentences = set()
    for mid in mid_list:
        with open("%s/%d.lrc" % (lyrics_dir, mid)) as lyric_file:
            raw_lyric = lyric_file.read()

        lyric = Lyric()
        lyric.loadstr(raw_lyric.decode('gbk'))

        for line_item in lyric.getlinelist():
            line = line_item['content']

            # Only keep lines where the helper reports exactly one marker.
            if check_have_but_line(line) != 1:
                continue

            but_pos = get_but_pos(line)
            # NOTE(review): ``but_pos + 1`` presumably skips a marker that is
            # one character wide -- confirm against commontool.but_pattern.
            simple_sentences.add(str(mid) + ':' + line[but_pos + 1:])

    with open(out_path, "w") as outf:
        for sentence in simple_sentences:
            # Same bytes as the former ``print>>outf, ...``: entry + newline.
            outf.write(sentence.encode('utf-8') + "\n")


# Script entry point: run the extraction only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    get_but_simple_sentence()
